#include "config.h"

#define DBG_SUBSYS S_LIBCLUSTER

#include "lich_api.h"

#include "diskmap.h"

static char *__type__[] = {"null", "list", "disk", "node", "rack", "site"};

typedef enum {
        DISKMAP_NULL = 0,
        DISKMAP_LIST,
        DISKMAP_DISK,
        DISKMAP_NODE,
        DISKMAP_RACK,
        DISKMAP_SITE,
} type_t;

typedef struct __diskmap_entry {
        int count;
        type_t type;
        struct list_head hook;         //hook of same level
        struct list_head parent_hook;  //hook of parent
        struct list_head child_list;   //list for child
        struct list_head *pos;         //use pos of child list
        //struct __diskmap_entry *parent;

        nid_t diskid;
        char *name;
        char *id;
        char buf[0];
} entry_t;

/**
 * 每个__diskmap_entry处于两个list中，
 * 一是parent的child_list（parent_hook), 树中entry类型为具体类型
 * 二是每层的list（hook）, 头entry类型是list
 */
typedef struct {
        int inited;
        sy_rwlock_t lock;
        // root tree
        entry_t cluster; //root of site (level-by-level tree)
        // layer list
        entry_t site;    //list of all site
        entry_t rack;    //list of all rack
        entry_t node;    //list of all node
        entry_t disk;    //list of all disk
} diskmap_t;

#define POOLMAP_ALONE_TIMEOUT 30
#define POOLMAP_TOTAL_TIMEOUT 3

typedef struct {
        struct list_head hook;
        char pool[MAX_NAME_LEN];
        diskmap_t diskmap;

        time_t alone_begin;
        BOOL alone_timeout;
} poolmap_entry_t;

typedef struct {
        sy_rwlock_t lock;
        //struct list_head list;
        count_list_t list;
} poolmap_t;

static poolmap_t *__poolmap__;
static diskmap_t __diskmap__;

static int __poolmap_standalone(poolmap_entry_t *ent);
static int __poolmap_get(poolmap_t *poolmap, const char *pool, poolmap_entry_t **_ent);
static void __diskmap_add(diskmap_t *diskmap, const char *name, const nid_t *nid);
static int __diskmap_skip_add(diskmap_t *diskmap, const nid_t *skip, int count);
static void __diskmap_dump_nolock(const diskmap_t *diskmap, const char *value);

inline static void __diskmap2nid(const diskmap_t *diskmap, nid_t *nid, int *_count)
{
        entry_t *site, *rack, *node;
        int i = 0;

        list_for_each_entry(site, &diskmap->cluster.child_list, parent_hook) {
                DINFO("map %u site:%s --> %s\n", i, site->name, site->id);
                list_for_each_entry(rack, &site->child_list, parent_hook) {
                        DINFO("map %u     rack:%s --> %s\n", i, rack->name, rack->id);
                        list_for_each_entry(node, &rack->child_list, parent_hook) {
                                DINFO("map %u         node:%s --> %s\n", i, node->name, node->id);
                                nid[i] = node->diskid;
                                i++;
                        }
                }
        }

        *_count = i;
}

inline static int __skip_check(const nid_t *skip, int skip_count, const nid_t *nid, int nid_count, const diskmap_t *diskmap)
{
        int ret, _rack_count, total, i, j;
        char rack_array[LICH_REPLICA_MAX][MAX_NAME_LEN], name[MAX_NAME_LEN];

        ret = rack_count(&total);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        if (total <= 1) {
                goto out;
        }

        ret = nids2racks(rack_array, &_rack_count, skip, skip_count);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        for (i = 0; i < nid_count; i++) {
                ret = nid2rack(&nid[i], name);
                if (unlikely(ret))
                        GOTO(err_ret, ret);

                for (j = 0; j < _rack_count; j++) {
                        if (strcmp(rack_array[j], name) == 0) {
                                if (diskmap) {
                                        __diskmap_dump_nolock(diskmap, "");
                                }

                                YASSERT(0);
                        }
                }
        }

out:
        return 0;
err_ret:
        return ret;
}

static void __diskmap_list_init(entry_t *ent)
{
        INIT_LIST_HEAD(&ent->child_list);
        INIT_LIST_HEAD(&ent->hook);
        INIT_LIST_HEAD(&ent->parent_hook);

        ent->pos = &ent->child_list;
        ent->count = 0;
        ent->type = DISKMAP_LIST;
        ent->id = NULL;

        YASSERT(ent->pos);
}

static void __diskmap_dump_nolock(const diskmap_t *diskmap, const char *value)
{
        const entry_t *site, *rack, *node, *disk;

        list_for_each_entry(site, &diskmap->cluster.child_list, parent_hook) {
                DBUG("map %s site:%s --> %s\n", value, site->name, site->id);
                list_for_each_entry(rack, &site->child_list, parent_hook) {
                        DBUG("map %s     rack:%s --> %s\n", value, rack->name, rack->id);
                        list_for_each_entry(node, &rack->child_list, parent_hook) {
                                DBUG("map %s         node:%s --> %s\n", value, node->name, node->id);
                                list_for_each_entry(disk, &node->child_list, parent_hook) {
                                        DBUG("map %s             disk:%s --> %s\n", value, disk->name, disk->id);
                                }
                        }
                }
        }
}

static int __diskmap_init(diskmap_t *diskmap)
{
        int ret;

        YASSERT(diskmap->inited == 0);

        ret = sy_rwlock_init(&diskmap->lock, "diskmap.lock");
        if (unlikely(ret))
                GOTO(err_ret, ret);

        __diskmap_list_init(&diskmap->cluster);

        __diskmap_list_init(&diskmap->site);

        __diskmap_list_init(&diskmap->rack);

        __diskmap_list_init(&diskmap->node);

        __diskmap_list_init(&diskmap->disk);

        diskmap->inited = 1;

        return 0;
err_ret:
        return ret;
}

int diskmap_init()
{
        return __diskmap_init(&__diskmap__);
}

static entry_t *__diskmap_find_inchild(entry_t *parent, const char *name)
{
        entry_t *ent;

        list_for_each_entry(ent, &parent->child_list, parent_hook) {
               // DINFO("type %u id %s name %s %s\n", ent->type, ent->id, ent->name, name);
                if (strcmp(ent->name, name) == 0) {
                        return ent;
                }
        }

        return NULL;
}

static int __diskmap_list_inchild(entry_t *parent, void *buf, int *_count)
{
        int count = 0;
        entry_t *ent;
        uint32_t buflen;

        list_for_each_entry(ent, &parent->child_list, parent_hook) {
                DBUG("------- type %u id %s name %s -------\n", ent->type, ent->id, ent->name);
                buf = _opaque_encode(buf, &buflen, ent->name, strlen(ent->name) + 1, NULL);
                count += buflen;
        }

        *_count = count;

        return 0;
}

static entry_t *__diskmap_find_inlist(entry_t *parent, const char *name)
{
        entry_t *ent;

        list_for_each_entry(ent, &parent->child_list, hook) {
                DINFO("type %u id %s name %s %s\n", ent->type, ent->id, ent->name, name);
                if (strcmp(ent->id, name) == 0) {
                        return ent;
                }
        }

        return NULL;
}

/** 通过移动pos来实现轮询
 *
 * @param parent
 * @return
 */
static entry_t *__diskmap_get_inchild0(entry_t *parent)
{
        int i, rand;
        entry_t *ent;

        YASSERT(parent->pos);

        if (parent->count > 5) {
                if (parent->pos == &parent->child_list) {
                        parent->pos = parent->pos->next;
                }

                ent = list_entry(parent->pos, entry_t, parent_hook);
                parent->pos = parent->pos->next;
        } else {
                rand = _random() % (parent->count);
                /*DINFO("parent->id: %s, rand: %d, count: %d\n", parent->id, rand, parent->count);*/

                parent->pos = (&parent->child_list)->next;
                for (i = 0; i < rand; i++) {
                        parent->pos = parent->pos->next;
                }

                ent = list_entry(parent->pos, entry_t, parent_hook);
        }

        YASSERT(parent->pos);

        return ent;
}

static entry_t *__diskmap_get_inlist(entry_t *list)
{
        entry_t *ent;

        YASSERT(list->pos);

        if (list->pos == &list->child_list)
                list->pos = list->pos->next;

        ent = list_entry(list->pos, entry_t, hook);

        list->pos = list->pos->next;

        YASSERT(list->pos);

        return ent;
}

/**
 *
 * @param _ent
 * @param _count 下属的磁盘数量
 */
static void __diskmap_count(entry_t *_ent, int *_count)
{
        entry_t *ent;
        int count = 0;

        YASSERT(_ent->type != DISKMAP_DISK);

        if (_ent->type == DISKMAP_NODE) {
                list_for_each_entry(ent, &_ent->child_list, parent_hook) {
                        count++;
                }

                *_count += count;
        } else {
                if (_ent->type == DISKMAP_LIST) {
                        list_for_each_entry(ent, &_ent->child_list, hook) {
                                __diskmap_count(ent, _count);
                        }
                } else {
                        list_for_each_entry(ent, &_ent->child_list, parent_hook) {
                                __diskmap_count(ent, _count);
                        }
                }
        }

        // DINFO("type %d count %d\n", _ent->type, *_count);
}

static int __diskmap_exist(const char *id, entry_t *_ent, int *count)
{
        //struct list_head *pos;
        entry_t *ent;
        int i;

        DBUG("test %s @ %s\n", id, _ent->id);

        i = 0;
        list_for_each_entry(ent, &_ent->child_list, hook) {//(pos, &_ent->child_list) {
                //ent = (void *)pos;
                DBUG("cmp[%u] %s --> %s type %s\n", i, id, ent->id, __type__[ent->type]);
                if (strcmp(ent->id, id) == 0) {
                        if (count) {
                                if (ent->type == DISKMAP_DISK) {
                                        *count = 1;
                                } else {
                                        *count = 0;
                                        __diskmap_count(ent, count);
                                }
                        }

                        DBUG("%s exist\n", id);

                        return 1;
                }

                i++;
        }

        DBUG("%s not exist\n", id);

        return 0;
}

typedef struct {
        entry_t *ent;
        int count;
} entry_array_t;

static int __diskmap_rc_cmp(const void *arg1, const void *arg2)
{
        const entry_array_t *sec1 = arg1, *sec2 = arg2;
        return sec1->count - sec2->count;
}

static void __diskmap_entry_check(const entry_t *ent)
{
        YASSERT(ent->type > DISKMAP_NULL && ent->type <= DISKMAP_SITE);
}

static entry_t *__diskmap_get_inchild(entry_t *parent, entry_t *skip)
{
        int i, count, rc, ents_count, ent_exist;
        entry_t *ent;
        entry_t *ents[512];
        entry_array_t array[512];

        ANALYSIS_BEGIN(0);

        rc = 0;
        ents_count = 0;
        while (ents_count < parent->count) {
                ent  =__diskmap_get_inchild0(parent);
                ent_exist = 0;
                for (i = 0; i < ents_count; i++) {
                        if (strcmp(ents[i]->id, ent->id) == 0) {
                                ent_exist = 1;
                        }
                }

                if (ent_exist) {
                        continue;
                } else {
                        ents[ents_count] = ent;
                        ents_count++;
                }

                if (__diskmap_exist(ent->id, skip, &count)) {
#if 0
                        // TODO 单rack无法工作
                        continue;
#else
                        DBUG("parent.type %d parent.count %d count %d\n",
                             parent->type, parent->count, count);

                        if ((parent->type == DISKMAP_RACK && !nodetable_standalone())
                                        || (parent->type == DISKMAP_SITE && parent->count != 1)
                                        || parent->type == DISKMAP_NODE) {
                                DBUG("%s exist\n", ent->id);
                                continue;
                        } else {
                                array[rc].ent = ent;
                                array[rc].count = count;
                                DBUG("rc %s exist, count %u\n", ent->id, count);
                                rc++;
                                continue;
                        }
#endif
                }

                __diskmap_entry_check(ent);
                return ent;
        }

        ANALYSIS_END(0, 1000 * 100, NULL);

        if (rc) {
                ARRAY_SORT(array, rc, __diskmap_rc_cmp);

                __diskmap_entry_check(array[0].ent);
                return array[0].ent;
        } else
                return NULL;
}

static int __diskmap_get_bydisk(diskmap_t *diskmap, diskid_t *diskid, int *_count, diskmap_t *skip)
{
        int ret, i, count;
        entry_t *ent;

        ANALYSIS_BEGIN(0);

        if (!nodetable_standalone()) {
                //__diskmap_dump_nolock(diskmap, "diskmap");
                //__diskmap_dump_nolock(skip, "skip");
                ret = ENOSPC;
                GOTO(err_ret, ret);
        }

        if (diskmap->disk.count < *_count + skip->disk.count) {
                count = diskmap->disk.count - skip->disk.count;

                if (count <= 0) {
                        ret = ENOSPC;
                        DWARN("total %u skip %u need %u (%d) %s\n", diskmap->disk.count,
                              skip->disk.count, *_count, ret, strerror(ret));
                        //__diskmap_dump_nolock(diskmap, "diskmap");
                        //__diskmap_dump_nolock(skip, "skip");
                        GOTO(err_ret, ret);
                }
        } else {
                count = *_count;
        }

        for (i = 0; i < count; i++) {
                while (1) {
                        ent = __diskmap_get_inlist(&diskmap->disk);//get first disk from disk list
                        if (__diskmap_exist(ent->id, &skip->disk, NULL)) {
                                continue;
                        } else
                                break;
                }

                diskid[i] = ent->diskid;
                YASSERT(!net_isnull(&diskid[i]));
                ret = __diskmap_skip_add(skip, &ent->diskid, 1);
                if (ret)
                        GOTO(err_ret, ret);
        }

        *_count = count;

        metadata_check_diskid(diskid, *_count);

        ANALYSIS_END(0, 1000 * 100, NULL);

        return 0;
err_ret:
        return ret;
}

static int __diskmap_skip_add(diskmap_t *diskmap, const nid_t *skip, int count)
{
        int ret, i;
        char name[MAX_NAME_LEN];

        for (i = 0; i < count; i++) {
#if 0
                if (!netable_connected(&skip[i]))
                        continue;
#endif

                ret = network_rname1(&skip[i], name);
                if (ret)
                        GOTO(err_ret, ret);

                __diskmap_add(diskmap, name, &skip[i]);
        }

        return 0;
err_ret:
        return ret;
}

static int __diskmap_skip(diskmap_t *diskmap, const nid_t *skip, int count)
{
        int ret;

        diskmap->inited = 0;

        ret = __diskmap_init(diskmap);

        YASSERT(ret == 0);

        ret = __diskmap_skip_add(diskmap, skip, count);
        if (ret)
                GOTO(err_ret, ret);

        return 0;
err_ret:
        return ret;
}

static void __diskmap_rack_skip_add(diskmap_t *diskmap, const char *rackname,
                                    const nid_t *skip, int count)
{
        int ret, i;
        char name[MAX_NAME_LEN];
        char _rack[MAX_NAME_LEN], _node[MAX_NAME_LEN], _disk[MAX_NAME_LEN],
                _site[MAX_NAME_LEN], buf[MAX_NAME_LEN];

        for (i = 0; i < count; i++) {
                if (!netable_connected(&skip[i]))
                        continue;

                ret = network_rname1(&skip[i], name);
                if (ret)
                        continue;

                hosts_split(name, _site, _rack, _node, _disk);

                snprintf(buf, MAX_BUF_LEN, "%s.%s", _site, _rack);
                if (strcmp(rackname, buf) != 0)
                        continue;

                __diskmap_add(diskmap, name, &skip[i]);
        }
}

inline static void __diskmap_rack_skip(diskmap_t *diskmap, const char *rackname,
                                const nid_t *skip, int count)
{
        int ret;

        diskmap->inited = 0;

        ret = __diskmap_init(diskmap);

        YASSERT(ret == 0);

        __diskmap_rack_skip_add(diskmap, rackname, skip, count);
}

inline static int __diskmap_get_bynode(diskmap_t *diskmap, diskid_t *diskid,
                                int *_count, diskmap_t *skip)
{
        int ret, i, newcount, count, left;
        entry_t *node, *disk;

        ANALYSIS_BEGIN(0);

        count = *_count;
        if (diskmap->node.count < count + skip->node.count) {
                newcount = diskmap->node.count - skip->node.count;
                if (newcount > 0) {
                        ret = __diskmap_get_bynode(diskmap, diskid, &newcount, skip);
                        if (unlikely(ret)) {
                                GOTO(err_ret, ret);
                        }

                        //__diskmap_skip_add(skip, diskid, newcount);

                        left = count - newcount;
                        ret = __diskmap_get_bydisk(diskmap, &diskid[newcount],
                                                   &left, skip);
                        if (unlikely(ret)) {
                                GOTO(err_ret, ret);
                        }

                        *_count = newcount + left;
                } else {
                        ret = __diskmap_get_bydisk(diskmap, diskid, &count, skip);
                        if (unlikely(ret)) {
                                GOTO(err_ret, ret);
                        }

                        *_count = count;
                }
        } else {
                for (i = 0; i < count; i++) {
                        while (1) {
                                node = __diskmap_get_inlist(&diskmap->node);//get first node from node list
                                if (__diskmap_exist(node->id, &skip->node, NULL)) {
                                        continue;
                                }

                                disk = __diskmap_get_inchild(node, &skip->disk);
                                if (disk == NULL) {
                                        YASSERT(0);
                                        continue;
                                }

                                break;
                        }


                        diskid[i] = disk->diskid; //get first disk from node


                        YASSERT(!net_isnull(&diskid[i]));
                        ret = __diskmap_skip_add(skip, &disk->diskid, 1);
                        if (unlikely(ret))
                                GOTO(err_ret, ret);
                }

                *_count = count;
        }

        for (i = 0; i < *_count; i++) {
                YASSERT(!net_isnull(&diskid[i]));
        }

        metadata_check_diskid(diskid, *_count);

        ANALYSIS_END(0, 1000 * 100, NULL);

        return 0;
err_ret:
        return ret;
}

static int  __diskmap_get_insite__(poolmap_entry_t *ent, diskid_t *diskid, const char *site_name, int *_count, diskmap_t *skip)
{
        int ret, i, count, retry;
        entry_t *rack, *node, *disk, *site;
        diskmap_t *diskmap = &ent->diskmap;

#if ENABLE_CHUNK_DEBUG
        int skip_count;
        nid_t skip_nid[LICH_REPLICA_MAX];

        __diskmap2nid(skip, skip_nid, &skip_count);
#endif

        ANALYSIS_BEGIN(0);

        count = *_count;
        for (i = 0; i < count; i++) {
                retry = 0;
                while (1) {
                        site = __diskmap_find_inchild(&diskmap->cluster, site_name);
                        if (site == NULL) {
                                ret = ENOSPC;
                                GOTO(err_ret, ret);
                        }

                        rack = __diskmap_get_inchild(site, &skip->rack);//get first rack from site list
                        if (rack == NULL) {
                                goto out;
                        }

                        node = __diskmap_get_inchild(rack, &skip->node);//get first node from rack
                        if (node == NULL) {
                                goto out;
                        }

                        disk = __diskmap_get_inchild(node, &skip->disk);
                        if (disk == NULL) {
                                if (retry < diskmap->disk.count) {
                                        DBUG("no disk available retry %u\n", retry);
                                        retry ++;
                                        continue;
                                }

                                if (i == 0) {
                                        ret = ENOSPC;
                                        GOTO(err_ret, ret);
                                } else {
                                        DBUG("%s\n", site_name);
                                        goto out;
                                }
                        } else {
                                DBUG("got disk%s, skip %u\n", network_rname(&disk->diskid), skip->disk.count);

                                YASSERT(disk->type == DISKMAP_DISK);
                                diskid[i] = disk->diskid; //get first disk from node
                                YASSERT(!net_isnull(&diskid[i]));
                                ret = __diskmap_skip_add(skip, &disk->diskid, 1);
                                if (unlikely(ret))
                                        GOTO(err_ret, ret);

                                break;
                        }
                }
        }

out:
        *_count = i;

        if (*_count == 0) {
                DWARN("pool %s no space\n", ent->pool);
                __diskmap_dump_nolock(&ent->diskmap, ent->pool);
                __diskmap_dump_nolock(skip, "skip");
                ret = ENOSPC;
                GOTO(err_ret, ret);
        }
        
        metadata_check_diskid(diskid, *_count);

        for (i = 0; i < *_count; i++) {
                YASSERT(!net_isnull(&diskid[i]));
        }

#if ENABLE_CHUNK_DEBUG
        __skip_check(skip_nid, skip_count, diskid, *_count, &ent->diskmap);
        rack_check(diskid, *_count);
#endif

        ANALYSIS_END(0, 1000 * 100, NULL);

        return 0;
err_ret:
        return ret;
}

static void __diskmap_del_entry(diskmap_t *diskmap, entry_t *list, entry_t *parent, entry_t *ent)
{
        list_del(&ent->hook);
        list_del(&ent->parent_hook);

        parent->count--;
        list->count--;

        list->pos = &list->child_list;
        parent->pos = &parent->child_list;

        YASSERT(parent->pos);
        YASSERT(list->pos);

        if (diskmap == &__diskmap__)
                DINFO("del %s %s, parent %u  list %u\n", __type__[ent->type],
                      ent->name, parent->count, list->count);
}

static void __diskmap_del_tree(diskmap_t *diskmap, const char *name)
{
        entry_t *rack, *node, *disk, *site;
        char _rack[MAX_NAME_LEN], _node[MAX_NAME_LEN], _disk[MAX_NAME_LEN],
                _site[MAX_NAME_LEN];

        hosts_split(name, _site, _rack, _node, _disk);

        site = __diskmap_find_inchild(&diskmap->cluster, _site);
        if (site == NULL) {
                DERROR("size %s already removed\n", name);
                return;
        }

        rack = __diskmap_find_inchild(site, _rack);
        if (rack == NULL) {
                DERROR("rack %s already removed\n", name);
                return;
        }

        node = __diskmap_find_inchild(rack, _node);
        if (node == NULL) {
                DERROR("node %s already removed\n", name);
                return;
        }

        disk = __diskmap_find_inchild(node, _disk);
        YASSERT(disk);

        __diskmap_del_entry(diskmap, &diskmap->disk, node, disk);

        if (list_empty(&node->child_list)) {
                __diskmap_del_entry(diskmap, &diskmap->node, rack, node);
                if (list_empty(&rack->child_list)) {
                        __diskmap_del_entry(diskmap, &diskmap->rack, site, rack);
                        yfree1((void **)&rack);
                        if (list_empty(&site->child_list)) {
                                __diskmap_del_entry(diskmap, &diskmap->site, &diskmap->cluster, site);
                                yfree1((void **)&site);
                        }
                }

                yfree1((void **)&node);
        }

        yfree1((void **)&disk);
}

static void __diskmap_destroy(diskmap_t *diskmap)
{
        int ret;
        //struct list_head *pos, *n;
        entry_t *disk, *n;

        ret = sy_rwlock_wrlock(&diskmap->lock);
        if (unlikely(ret))
                UNIMPLEMENTED(__DUMP__);

        list_for_each_entry_safe(disk, n, &diskmap->disk.child_list, hook) {
                __diskmap_del_tree(diskmap, disk->id);
        }

        diskmap->inited = 0;

        sy_rwlock_unlock(&diskmap->lock);

}

void diskmap_destroy()
{
        __diskmap_destroy(&__diskmap__);
}

int __diskmap_get_insite(poolmap_entry_t *ent, diskid_t *diskid, int *_count, const char *site_name,
                const diskid_t *_skip, int skip_count, int force)
{
        int ret, need;
        diskmap_t skip;

        ANALYSIS_BEGIN(0);

        need = *_count;
        ret = __diskmap_skip(&skip, _skip, skip_count);
        if (unlikely(ret))
                GOTO(err_ret, ret);

#if 0
        if (diskmap->node.count == 1) {
                if (!__diskmap_standalone()) {
                        ret = ENOSPC;
                        GOTO(err_skip, ret);
                }
        }
#endif

        ret = __diskmap_get_insite__(ent, diskid, site_name, _count, &skip);
        if (unlikely(ret)) {
                GOTO(err_skip, ret);
        }

        if (force && need != *_count) {
                ret = ENOSPC;
                GOTO(err_skip, ret);
        }

        if (*_count == 0) {
                DWARN("pool %s no space\n", ent->pool);
                __diskmap_dump_nolock(&ent->diskmap, ent->pool);
                __diskmap_dump_nolock(&skip, "skip");
                ret = ENOSPC;
                GOTO(err_ret, ret);
        }
        
        ANALYSIS_END(0, 1000 * 10, NULL);
        __diskmap_destroy(&skip);

        return 0;
err_skip:
        __diskmap_destroy(&skip);
err_ret:
        return ret;
}

static int __diskmap_get__(poolmap_entry_t *ent, diskid_t *diskid, int *_count,
                const diskid_t *_skip, int skip_count, int force)
{
        int ret, need;
        entry_t *site;
        diskmap_t skip;
        int site_names_count = 0, i, find, ok;
        char **site_names;
        diskmap_t *diskmap = &ent->diskmap;

        (void)force;
        ANALYSIS_BEGIN(0);

        need = *_count;
        ret = __diskmap_skip(&skip, _skip, skip_count);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        if (diskmap->site.count == 0) {
                ret = ENOSPC;
                GOTO(err_skip, ret);
        }

        ret = ymalloc((void **)&site_names, sizeof(char *) * diskmap->site.count);
        if (ret)
                GOTO(err_skip, ret);

        ok = 0;
        while (site_names_count < diskmap->site.count) {
                site = __diskmap_get_inchild(&diskmap->cluster, &skip.site);//get first site from cluster list
                if (site == NULL) {
                        ret = ENOSPC;
                        GOTO(err_free, ret);
                }

                find = 0;
                for (i = 0; i < site_names_count; i++) {
                        if (strcmp(site->id, site_names[i]) == 0) {
                                find = 1;
                                break;
                        }
                }

                if (find) {
                        continue;
                } else {
                        site_names[site_names_count++] = site->id;
                }

                *_count = need;
                ret = __diskmap_get_insite(ent, diskid, _count, site->id, _skip, skip_count, force);
                if (ret) {
                        continue;
                } else {
                        ok = 1;
                        break;
                }
        }

        if (!ok) {
                ret = ENOSPC;
                GOTO(err_free, ret);
        }

        if (force && need != *_count) {
                ret = ENOSPC;
                GOTO(err_free, ret);
        }

        if (*_count == 0) {
                DWARN("pool %s no space\n", ent->pool);
                __diskmap_dump_nolock(&ent->diskmap, ent->pool);
                __diskmap_dump_nolock(&skip, "skip");
                ret = ENOSPC;
                GOTO(err_ret, ret);
        }
        
        ANALYSIS_END(0, 1000 * 10, NULL);

        yfree((void **)&site_names);
        __diskmap_destroy(&skip);

        return 0;
err_free:
        yfree((void **)&site_names);
err_skip:
        __diskmap_destroy(&skip);
err_ret:
        return ret;
}

static int __diskmap_get(poolmap_entry_t *ent, diskid_t *diskid, int *_count, const diskid_t *_skip, int skip_count, int force)
{
        int ret, need;
        diskmap_t *diskmap = &ent->diskmap;

        need = *_count;
        ret = sy_rwlock_wrlock(&diskmap->lock);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        ret = __diskmap_get__(ent, diskid, _count, _skip, skip_count, force);
        if (unlikely(ret))
                GOTO(err_lock, ret);

        if (force && need != *_count) {
                ret = ENOSPC;
                GOTO(err_lock, ret);
        }

        sy_rwlock_unlock(&diskmap->lock);

        return 0;
err_lock:
        sy_rwlock_unlock(&diskmap->lock);
err_ret:
        return ret;
}

int diskmap_get(const char *pool, diskid_t *diskid, int repnum,
                const nid_t *skip, int skip_count, int flag)
{
        int ret, need;
        poolmap_entry_t *ent;

        ret = __poolmap_get(__poolmap__, pool, &ent);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        if (__poolmap_standalone(ent)) {
                ret = ENOSPC;
                GOTO(err_ret, ret);
        }

        need = repnum;
        ret = __diskmap_get(ent, diskid, &need, skip, skip_count, flag);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        if (need != repnum) {
                ret = ENOSPC;
                GOTO(err_ret, ret);
        }
        
        return 0;
err_ret:
        return ret;
}

static entry_t *__diskmap_new_entry(diskmap_t *diskmap, entry_t *parent,
                                    entry_t *list, const char *id, const char *name, type_t type)
{
        int ret;
        entry_t *ent;

        ent = __diskmap_find_inchild(parent, name);
        if (ent)
                return ent;

        ret = ymalloc1((void **)&ent, sizeof(*ent) + strlen(name) + 1 + strlen(id) + 1);
        if (unlikely(ret))
                UNIMPLEMENTED(__DUMP__);

        ent->type = type;
        ent->name = ent->buf;
        ent->id = ent->buf + strlen(name) + 1;
        strcpy(ent->name, name);
        strcpy(ent->id, id);
        INIT_LIST_HEAD(&ent->child_list);
        INIT_LIST_HEAD(&ent->parent_hook);
        INIT_LIST_HEAD(&ent->hook);

        list_add(&ent->parent_hook, &parent->child_list);
        list_add(&ent->hook, &list->child_list);

        //ent->parent = parent;
        ent->pos = &ent->child_list;
        parent->count++;
        list->count++;

        YASSERT(ent->pos);

        if (diskmap == &__diskmap__)
                DINFO("add %s %s, parent %u  list %u\n", __type__[type],
                      name, parent->count, list->count);

        return ent;
}

/** add node to diskmap
 *
 * @param diskmap
 * @param name hostname
 * @param nid
 */
static void __diskmap_add(diskmap_t *diskmap, const char *name, const nid_t *nid)
{
        int ret;
        entry_t *rack, *node, *disk, *site;
        char _rack[MAX_NAME_LEN], _node[MAX_NAME_LEN], _site[MAX_NAME_LEN],
                _disk[MAX_NAME_LEN], buf[MAX_BUF_LEN];

        YASSERT(!net_isnull(nid));

        hosts_split(name, _site, _rack, _node, _disk);
        DBUG("name: %s, _site: %s, _rack: %s, _node: %s, _disk: %s\n",
                        name, _site, _rack, _node, _disk);

        ret = sy_rwlock_wrlock(&diskmap->lock);
        if (unlikely(ret))
                UNIMPLEMENTED(__DUMP__);

        site = __diskmap_new_entry(diskmap, &diskmap->cluster,
                                   &diskmap->site , _site, _site, DISKMAP_SITE);

        snprintf(buf, MAX_BUF_LEN, "%s.%s", _site, _rack);
        rack = __diskmap_new_entry(diskmap, site, &diskmap->rack,
                                   buf, _rack, DISKMAP_RACK);

        snprintf(buf, MAX_BUF_LEN, "%s.%s.%s", _site, _rack, _node);
        node = __diskmap_new_entry(diskmap, rack, &diskmap->node,
                                   buf, _node, DISKMAP_NODE);

        node->diskid = *nid;
        snprintf(buf, MAX_BUF_LEN, "%s.%s.%s/%s", _site, _rack, _node, _disk);
        disk = __diskmap_new_entry(diskmap, node, &diskmap->disk,
                                   buf, _disk, DISKMAP_DISK);

        disk->diskid = *nid;

        //__diskmap_dump_nolock(diskmap, "diskmap");

        sy_rwlock_unlock(&diskmap->lock);

}

void diskmap_add(const char *name, const nid_t *nid)
{
        DINFO("add %s\n", name);

        __diskmap_add(&__diskmap__, name, nid);
}

static void __diskmap_del(diskmap_t *diskmap, const char *name)
{
        int ret;

        ret = sy_rwlock_wrlock(&diskmap->lock);
        if (unlikely(ret))
                UNIMPLEMENTED(__DUMP__);

        __diskmap_del_tree(diskmap, name);

        //__diskmap_dump_nolock(diskmap, "diskmap");

        sy_rwlock_unlock(&diskmap->lock);

}

void diskmap_del(const char *name)
{
        DINFO("del %s\n", name);
        __diskmap_del(&__diskmap__, name);
}

/*
int diskmap_getrack(const nid_t *nid, char *rack)
{
        int ret;
        char _rack[MAX_NAME_LEN], _node[MAX_NAME_LEN], _disk[MAX_NAME_LEN],
                _site[MAX_NAME_LEN], name[MAX_NAME_LEN];

        network_connect(nid, NULL, 1, 0);

        if (netable_connected(nid)) {
                ret = network_rname1(nid, name);
                if (unlikely(ret))
                        GOTO(err_ret, ret);
                
                hosts_split(name, _site, _rack, _node, _disk);
                snprintf(rack, MAX_BUF_LEN, "%s.%s", _site, _rack);
        } else {
                ret = ENONET;
                GOTO(err_ret, ret);
        }

        return 0;
err_ret:
        return ret;
}
*/

int diskmap_total(uint32_t *site, uint32_t *rack, uint32_t *node, uint32_t *disk)
{
        int ret, delen, done = 0;
        char de0[MAX_BUF_LEN];
        struct dirent *de;
        uint64_t offset = 0, offset2 = 0;
        diskmap_t total;
        nid_t nid;
        uuid_t _uuid;
        char uuid[MAX_NAME_LEN] = {};

        memset(&total, 0x0, sizeof(total));

        ret = __diskmap_init(&total);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        uuid_generate(_uuid);
        uuid_unparse(_uuid, uuid);

        ret = nodetable_list_open(uuid);
        if (ret)
                GOTO(err_distroy, ret);

        while (done == 0) {
                delen = MAX_BUF_LEN;
                memset(de0, 0, sizeof(de0));
                ret = nodetable_list(de0, &delen, uuid, offset);
                if (unlikely(ret))
                        GOTO(err_close, ret);

                if (delen == 0)
                        break;

                nid.id = 1;
                offset2 = 0;
                dir_for_each(de0, delen, de, offset2) {
                        if (strlen(de->d_name) == 0) {
                                done = 1;
                                break;
                        } else if (delen - offset2 < sizeof(*de) + MAX_NAME_LEN)
                                break;

                        offset += de->d_reclen;
                        nid.id++;
                        DBUG("disk %s\n", de->d_name);
                        __diskmap_add(&total, de->d_name, &nid);
                }
        }

        *site = total.site.count;
        *rack = total.rack.count;
        *node = total.node.count;
        *disk = total.disk.count;

        DBUG("%u %u %u %u\n", *site, *rack, *node, *disk);
        nodetable_list_close(uuid);
        __diskmap_destroy(&total);

        return 0;
err_close:
        nodetable_list_close(uuid);
err_distroy:
        __diskmap_destroy(&total);
err_ret:
        return ret;
}

int diskmap_total_cached(time_t now, uint32_t *site, uint32_t *rack, uint32_t *node, uint32_t *disk)
{
        int ret;
        static time_t last = 0;
        static uint32_t s, r, n, d;

        if (last == 0 || now - last > POOLMAP_TOTAL_TIMEOUT) {
                ret = diskmap_total(&s, &r, &n, &d);
                if (unlikely(ret))
                        GOTO(err_ret, ret);

                last = now;
        }

        *site = s;
        *rack = r;
        *node = n;
        *disk = d;

        return 0;
err_ret:
        return ret;
}

int diskmap_online(uint32_t *site, uint32_t *rack, uint32_t *node, uint32_t *disk)
{
        *site = __diskmap__.site.count;
        *rack = __diskmap__.rack.count;
        *node = __diskmap__.node.count;
        *disk = __diskmap__.disk.count;

        DBUG("%u %u %u %u\n", *site, *rack, *node, *disk);

        return 0;
}

int diskmap_check_insite(const char *site_name)
{
        int ret;
        diskmap_t *diskmap = &__diskmap__;
        entry_t *site = NULL;

        ret = sy_rwlock_rdlock(&diskmap->lock);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        site = __diskmap_find_inchild(&diskmap->cluster, site_name);
        if (site == NULL) {
                ret = ENOENT;
                GOTO(err_lock, ret);
        }

        sy_rwlock_unlock(&diskmap->lock);

        return 0;
err_lock:
        sy_rwlock_unlock(&diskmap->lock);
err_ret:
        return ret;
}

int diskmap_list_insite(void *buf, int *count)
{
        int ret;
        diskmap_t *diskmap = &__diskmap__;

        ret = sy_rwlock_rdlock(&diskmap->lock);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        ret = __diskmap_list_inchild(&diskmap->cluster, buf, count);
        if (unlikely(ret))
                GOTO(err_lock, ret);

        sy_rwlock_unlock(&diskmap->lock);

        return 0;
err_lock:
        sy_rwlock_unlock(&diskmap->lock);
err_ret:
        return ret;
}

static int __poolmap_standalone(poolmap_entry_t *ent)
{
        int ret;
        time_t now;
        uint32_t site, rack, node, disk;

        diskmap_t *diskmap;

        if (ent->alone_timeout)
                goto out;

        if (ent->alone_begin == 0) {
                return 1;
        }

        now = gettime();
        if (now - ent->alone_begin > POOLMAP_ALONE_TIMEOUT) {
                ent->alone_timeout = TRUE;
                goto out;
        }

        diskmap = &ent->diskmap;
        if (diskmap->rack.count == 0) {
                return 1;
        } else if (diskmap->rack.count == 1) {
                ret = diskmap_total_cached(now, &site, &rack, &node, &disk);
                if (unlikely(ret)) {
                        return 1;
                }

                if (rack == 0) {
                        return 1;
                } else if (rack == 1) {
                        ent->alone_timeout = TRUE;
                } else {
                        return 1;
                }
        } else {
                ent->alone_timeout = TRUE;
        }

out:
        return 0;
}

static int __poolmap_find(poolmap_t *poolmap, const char *pool, poolmap_entry_t **_ent)
{
        int ret, found = 0;
        struct list_head *pos;
        poolmap_entry_t *ent;

        ret = sy_rwlock_rdlock(&poolmap->lock);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        list_for_each(pos, &poolmap->list.list) {
                ent = (poolmap_entry_t *)pos;

                if (!strcmp(ent->pool, pool)) {
                        found = 1;
                        break;
                }
        }

        sy_rwlock_unlock(&poolmap->lock);

        if (found)
                *_ent = ent;
        else
                *_ent = NULL;

        return 0;
err_ret:
        return ret;
}

static int __poolmap_create(poolmap_t *poolmap, const char *pool, poolmap_entry_t **_ent)
{
        int ret;
        poolmap_entry_t *ent;

        ret = ymalloc((void **)&ent, sizeof(*ent));
        if (unlikely(ret))
                GOTO(err_ret, ret);

        memset(ent, 0x0, sizeof(*ent));

        strcpy(ent->pool, pool);
        __diskmap_init(&ent->diskmap);

        ret = sy_rwlock_wrlock(&poolmap->lock);
        if (unlikely(ret))
                GOTO(err_free, ret);

        count_list_add_tail(&ent->hook, &poolmap->list);

        sy_rwlock_unlock(&poolmap->lock);

        if (_ent)
                *_ent = ent;

        return 0;
err_free:
        yfree((void **)&ent);
err_ret:
        return ret;
}

static int __poolmap_get(poolmap_t *poolmap, const char *pool, poolmap_entry_t **_ent)
{
        int ret;
        poolmap_entry_t *ent;

        ret = __poolmap_find(poolmap, pool, &ent);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        if (!ent) {
                ret = __poolmap_create(poolmap, pool, &ent);
                if (unlikely(ret))
                        GOTO(err_ret, ret);
        }

        if (_ent)
                *_ent = ent;

        return 0;
err_ret:
        return ret;
}

int poolmap_init()
{
        int ret;
        poolmap_t *poolmap;

        ret = ymalloc((void **)&poolmap, sizeof(*poolmap));
        if (unlikely(ret))
                GOTO(err_ret, ret);

        ret = sy_rwlock_init(&poolmap->lock, "poolmap.lock");
        if (unlikely(ret))
                GOTO(err_free, ret);

        count_list_init(&poolmap->list);

        __poolmap__ = poolmap;

        return 0;
err_free:
        yfree((void **)&poolmap);
err_ret:
        return ret;
}

int poolmap_destroy()
{
        int ret;
        struct list_head *pos, *n;
        poolmap_entry_t *ent;
        poolmap_t *poolmap = __poolmap__;

        ret = sy_rwlock_rdlock(&poolmap->lock);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        list_for_each_safe(pos, n, &poolmap->list.list) {
                ent = (poolmap_entry_t *)pos;

                __diskmap_destroy(&ent->diskmap);

                count_list_del(&ent->hook, &poolmap->list);
                yfree((void **)&ent);
        }

        sy_rwlock_unlock(&poolmap->lock);

        return 0;
err_ret:
        return ret;
}

int poolmap_insert(const char *name, const char *pool, const nid_t *nid)
{
        int ret;
        poolmap_entry_t *ent;

        ret = __poolmap_get(__poolmap__, pool, &ent);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        DINFO("diskmap insert pool %s host %s nid %u\n", pool, name, nid->id);

        /*
         * for diskmap report slow when lich restart.
         * there have many racks,but only one report. then all replicas will dipatch in that rack.
         * wait 30s, if also only one rack, so just one rack.
         */
        if (ent->alone_begin == 0) {
                ent->alone_begin = gettime();
        }

        __diskmap_add(&ent->diskmap, name, nid);

        return 0;
err_ret:
        return ret;
}

int poolmap_remove(const char *name, const char *pool)
{
        int ret;
        poolmap_entry_t *ent;

        ret = __poolmap_get(__poolmap__, pool, &ent);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        DINFO("diskmap remove pool %s host %s\n", pool, name);

        __diskmap_del(&ent->diskmap, name);

        return 0;
err_ret:
        return ret;
}

int poolmap_iterate_entries(int (*handler)(void *, void *), void *arg)
{
        int ret;
        struct list_head *pos;

        ret = sy_rwlock_rdlock(&__poolmap__->lock);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        list_for_each(pos, &__poolmap__->list.list) {
                ret = (*handler)((poolmap_entry_t *)pos, arg);
                if (unlikely(ret))
                        GOTO(err_lock, ret);
        }

        sy_rwlock_unlock(&__poolmap__->lock);

        return 0;
err_lock:
        sy_rwlock_unlock(&__poolmap__->lock);
err_ret:
        return ret;
}

int poolmap_brief_info(int *count, brief_info_t **infos)
{
        int ret, n, rackcount, _count;
        entry_t *rackent;
        poolmap_entry_t *poolent;
        struct list_head *pos, *pos1;
        brief_info_t *_infos;

        *count = 0;
        *infos = NULL;

        ret = sy_rwlock_rdlock(&__poolmap__->lock);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        _count = __poolmap__->list.count;
        if (_count == 0)
                goto out;

        ret = ymalloc((void **)&_infos, sizeof(brief_info_t) * _count);
        if (unlikely(ret))
                GOTO(err_lock, ret);

        n = 0;
        list_for_each(pos, &__poolmap__->list.list) {
                poolent = (poolmap_entry_t *)pos;

                strcpy(_infos[n].pool, poolent->pool);
                _infos[n].writeable_node = poolent->diskmap.node.count;

                rackcount =0;
                list_for_each(pos1, &poolent->diskmap.rack.child_list) {
                        rackent = (entry_t *)pos1;
                        if (!list_empty(&rackent->child_list)) {
                                rackcount ++;
                        }

                }
                _infos[n].writeable_rack = rackcount;
                n ++;
        }

        YASSERT(n == _count);

        *count = _count;
        *infos = _infos;

out:
        sy_rwlock_unlock(&__poolmap__->lock);

        return 0;
err_lock:
        sy_rwlock_unlock(&__poolmap__->lock);
err_ret:
        return ret;
}

int poolmap_dump(const char *pool)
{

        int ret;
        poolmap_entry_t *ent;

        ret = __poolmap_get(__poolmap__, pool, &ent);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        __diskmap_dump_nolock(&ent->diskmap, pool);

        return 0;
err_ret:
        return ret;
}
